/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.net.urlnormalizer.regex;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

/**
 * Allows users to do regex substitutions on all/any URLs that are encountered,
 * which is useful for stripping session IDs from URLs.
 * 
 * <p>
 * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be
 * set to the file name of an xml file which should contain the patterns and
 * substitutions to be done on encountered URLs.
 * </p>
 * <p>
 * This class also supports different rules depending on the scope. Please see
 * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.
 * </p>
 * 
 * @author Luke Baker
 * @author Andrzej Bialecki
 */
public class RegexURLNormalizer extends Configured implements URLNormalizer {

	private static final Logger LOG = LoggerFactory
			.getLogger(RegexURLNormalizer.class);

	/**
	 * Class which holds a compiled pattern and its corresponding substition
	 * string.
	 */
	private static class Rule {
		public Pattern pattern;

		public String substitution;
	}

	private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() {
		protected java.util.HashMap<String, java.util.List<Rule>> initialValue() {
			return new HashMap<String, List<Rule>>();
		};
	};

	public HashMap<String, List<Rule>> getScopedRules() {
		return scopedRulesThreadLocal.get();
	}

	private List<Rule> defaultRules;

	private static final List<Rule> EMPTY_RULES = Collections.emptyList();

	/**
	 * The default constructor which is called from UrlNormalizerFactory
	 * (normalizerClass.newInstance()) in method: getNormalizer()*
	 */
	public RegexURLNormalizer() {
		super(null);
	}

	public RegexURLNormalizer(Configuration conf) {
		super(conf);
	}

	/**
	 * Constructor which can be passed the file name, so it doesn't look in the
	 * configuration files for it.
	 */
	public RegexURLNormalizer(Configuration conf, String filename)
			throws IOException, PatternSyntaxException {
		super(conf);
		List<Rule> rules = readConfigurationFile(filename);
		if (rules != null) {
			defaultRules = rules;
		}
	}

	public void setConf(Configuration conf) {
		super.setConf(conf);
		if (conf == null)
			return;
		// the default constructor was called

		String filename = getConf().get("urlnormalizer.regex.file");
		String stringRules = getConf().get("urlnormalizer.regex.rules");
		Reader reader = null;
		if (stringRules != null) {
			reader = new StringReader(stringRules);
		} else {
			reader = getConf().getConfResourceAsReader(filename);
		}
		List<Rule> rules = null;
		if (reader == null) {
			LOG.warn("Can't load the default rules! ");
			rules = EMPTY_RULES;
		} else {
			try {
				rules = readConfiguration(reader);
			} catch (Exception e) {
				LOG.warn("Couldn't read default config: " + e);
				rules = EMPTY_RULES;
			}
		}
		defaultRules = rules;
	}

	// used in JUnit test.
	void setConfiguration(Reader reader, String scope) {
		List<Rule> rules = readConfiguration(reader);
		getScopedRules().put(scope, rules);
		LOG.debug("Set config for scope '" + scope + "': " + rules.size()
				+ " rules.");
	}

	/**
	 * This function does the replacements by iterating through all the regex
	 * patterns. It accepts a string url as input and returns the altered
	 * string.
	 */
	public String regexNormalize(String urlString, String scope) {
		HashMap<String, List<Rule>> scopedRules = getScopedRules();
		List<Rule> curRules = scopedRules.get(scope);
		if (curRules == null) {
			// try to populate
			String configFile = getConf()
					.get("urlnormalizer.regex.file." + scope);
			if (configFile != null) {
				LOG.debug("resource for scope '" + scope + "': " + configFile);
				try {
					Reader reader = getConf()
							.getConfResourceAsReader(configFile);
					curRules = readConfiguration(reader);
					scopedRules.put(scope, curRules);
				} catch (Exception e) {
					LOG.warn("Couldn't load resource '" + configFile + "': "
							+ e);
				}
			}
			if (curRules == EMPTY_RULES || curRules == null) {
				LOG.info("can't find rules for scope '" + scope
						+ "', using default");
				scopedRules.put(scope, EMPTY_RULES);
			}
		}
		if (curRules == EMPTY_RULES || curRules == null) {
			curRules = defaultRules;
		}
		Iterator<Rule> i = curRules.iterator();
		while (i.hasNext()) {
			Rule r = (Rule) i.next();

			Matcher matcher = r.pattern.matcher(urlString);

			urlString = matcher.replaceAll(r.substitution);
		}
		return urlString;
	}

	public String normalize(String urlString, String scope)
			throws MalformedURLException {
		return regexNormalize(urlString, scope);
	}

	/** Reads the configuration file and populates a List of Rules. */
	private List<Rule> readConfigurationFile(String filename) {
		if (LOG.isInfoEnabled()) {
			LOG.info("loading " + filename);
		}
		try {
			FileReader reader = new FileReader(filename);
			return readConfiguration(reader);
		} catch (Exception e) {
			LOG.error("Error loading rules from '" + filename + "': " + e);
			return EMPTY_RULES;
		}
	}

	private List<Rule> readConfiguration(Reader reader) {
		List<Rule> rules = new ArrayList<Rule>();
		try {

			// borrowed heavily from code in Configuration.java
			Document doc = DocumentBuilderFactory.newInstance()
					.newDocumentBuilder().parse(new InputSource(reader));
			Element root = doc.getDocumentElement();
			if ((!"regex-normalize".equals(root.getTagName()))
					&& (LOG.isErrorEnabled())) {
				LOG.error(
						"bad conf file: top-level element not <regex-normalize>");
			}
			NodeList regexes = root.getChildNodes();
			for (int i = 0; i < regexes.getLength(); i++) {
				Node regexNode = regexes.item(i);
				if (!(regexNode instanceof Element))
					continue;
				Element regex = (Element) regexNode;
				if ((!"regex".equals(regex.getTagName()))
						&& (LOG.isWarnEnabled())) {
					LOG.warn("bad conf file: element not <regex>");
				}
				NodeList fields = regex.getChildNodes();
				String patternValue = null;
				String subValue = null;
				for (int j = 0; j < fields.getLength(); j++) {
					Node fieldNode = fields.item(j);
					if (!(fieldNode instanceof Element))
						continue;
					Element field = (Element) fieldNode;
					if ("pattern".equals(field.getTagName())
							&& field.hasChildNodes())
						patternValue = ((Text) field.getFirstChild()).getData();
					if ("substitution".equals(field.getTagName())
							&& field.hasChildNodes())
						subValue = ((Text) field.getFirstChild()).getData();
					if (!field.hasChildNodes())
						subValue = "";
				}
				if (patternValue != null && subValue != null) {
					Rule rule = new Rule();
					try {
						rule.pattern = Pattern.compile(patternValue);
					} catch (PatternSyntaxException e) {
						if (LOG.isErrorEnabled()) {
							LOG.error("skipped rule: " + patternValue + " -> "
									+ subValue
									+ " : invalid regular expression pattern: "
									+ e);
						}
						continue;
					}
					rule.substitution = subValue;
					rules.add(rule);
				}
			}
		} catch (Exception e) {
			if (LOG.isErrorEnabled()) {
				LOG.error("error parsing conf file: " + e);
			}
			return EMPTY_RULES;
		}
		if (rules.size() == 0)
			return EMPTY_RULES;
		return rules;
	}

	/**
	 * Spits out patterns and substitutions that are in the configuration file.
	 */
	public static void main(String args[])
			throws PatternSyntaxException, IOException {
		RegexURLNormalizer normalizer = new RegexURLNormalizer();
		normalizer.setConf(NutchConfiguration.create());
		HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
		Iterator<Rule> i = normalizer.defaultRules.iterator();
		System.out.println("* Rules for 'DEFAULT' scope:");
		while (i.hasNext()) {
			Rule r = i.next();
			System.out.print("  " + r.pattern.pattern() + " -> ");
			System.out.println(r.substitution);
		}
		// load the scope
		if (args.length > 1) {
			normalizer.normalize("http://test.com", args[1]);
		}
		if (scopedRules.size() > 1) {
			Iterator<String> it = scopedRules.keySet().iterator();
			while (it.hasNext()) {
				String scope = it.next();
				if (URLNormalizers.SCOPE_DEFAULT.equals(scope))
					continue;
				System.out.println("* Rules for '" + scope + "' scope:");
				i = ((List<Rule>) scopedRules.get(scope)).iterator();
				while (i.hasNext()) {
					Rule r = (Rule) i.next();
					System.out.print("  " + r.pattern.pattern() + " -> ");
					System.out.println(r.substitution);
				}
			}
		}
		if (args.length > 0) {
			System.out.println("\n---------- Normalizer test -----------");
			String scope = URLNormalizers.SCOPE_DEFAULT;
			if (args.length > 1)
				scope = args[1];
			System.out.println("Scope: " + scope);
			System.out.println("Input url:  '" + args[0] + "'");
			System.out.println("Output url: '"
					+ normalizer.normalize(args[0], scope) + "'");
		}
		System.exit(0);
	}

}
